This example will use the following libraries.
library(biomod2)
library(corrplot)
library(MASS)
library(ggplot2)
library(ecospat)
library(gam)
The data were prepared in Trillium-VTNH-Data.Rmd and saved. The data object is called dat.
load("data/trillium_with_predictors.RData")
allVars <- raster::brick("data/allVars.grd")
I want to use the presence data for Trillium undulatum and there are some NAs in slope and aspect so I’ll get rid of those.
datsub <- subset(dat, species=="Trillium undulatum")
datsub <- na.omit(datsub)
There are a few covariates that I know I don’t want. Specifically, the wet and dry qtr variables because that is summer in some cells and winter in others. Also I’ll exclude Snow, since that is all 0. And I’ll exclude cells and species since I don’t use those ever. Finally, dominant land cover is a categorical variable so I will exclude that (to make my life easier).
datsub <- datsub[, !stringr::str_detect(colnames(datsub),"[.]wet[.]")]
datsub <- datsub[, !stringr::str_detect(colnames(datsub),"[.]dry[.]")]
datsub <- subset(datsub, select=c(-cells, -species, -Dominant.Land.Cover, -Snow))
First I will use the corrplot package to look at correlation.
tmpdat <-subset(datsub, select=stringr::str_detect(colnames(datsub), "precip"))
varCor <- cor(tmpdat, use = "na.or.complete")
corrplot::corrplot(varCor)
Many of the temperature variables are very correlated.
tmpdat <-subset(datsub, select=stringr::str_detect(colnames(datsub), "temp"))
varCor <- cor(tmpdat, use = "na.or.complete")
corrplot::corrplot(varCor)
After exploring the models, I came up with the following set of not too correlated variables that still explain much of the variability in presence/absence. But below I will also try variance inflation to select a set of non-collinear variables.
envvars <- c("mean.temp", "temp.diurnal.range", "temp.seasonality", "precip.warm.qtr", "precip.cold.qtr")
That gets me a set of variables that are not so horribly correlated.
tmpdat <- datsub[,envvars]
varCor <- cor(tmpdat, use = "na.or.complete")
corrplot::corrplot(varCor)
And the variance inflation factors look ok.
usdm::vif(tmpdat)
## Variables VIF
## 1 mean.temp 5.130822
## 2 temp.diurnal.range 1.435384
## 3 temp.seasonality 3.866922
## 4 precip.warm.qtr 5.469242
## 5 precip.cold.qtr 3.624185
I’ll test models with all variables and then these subsets.
topovars <- c("elevation", "slope", "aspect")
lcvars <- c("Tree.Cover", "mean.temp", "precip.warm.qtr" )
envvars <- c("mean.temp", "temp.diurnal.range", "temp.seasonality", "precip.warm.qtr", "precip.cold.qtr")
We set up a training set of presence data and a test set. kfold is just a function to randomly assign the data to k groups. I want just the presence data so will subset to pa column equal 1. I will call the presence only data presdat.
#set.seed(10)
presdat <- subset(datsub, pa==1)
group <- dismo::kfold(presdat, k=5) # 5 groups = 20 test/80 train split
The testing data will be group==1 and training data will be the rest.
pres_train <- presdat[group != 1, ]
pres_test <- presdat[group == 1, ]
Repeat for the background data.
bgdat <- subset(datsub, pa==0)
group <- dismo::kfold(bgdat, k=5)
backg_train <- bgdat[group != 1, ]
backg_test <- bgdat[group == 1, ]
We make separate presence and background train/test sets for evaluation purposes later. But for fitting we need a data frame with both train data sets (presence and background) together.
traindat <- rbind(pres_train, backg_train)
I’ll make a holder for the model output.
modellist <- list()
The full set of variables of highly correlated variables. As an experiment, I will use variable inflation to select a set of non-correlated variables.
vifres <- usdm::vifstep(subset(traindat, select=c(-pa, -lon, -lat)))
vifvars <- as.character(vifres@results$Variables)
vifvars
## [1] "temp.diurnal.range" "temp.seasonality"
## [3] "mean.temp.warm.qtr" "precip.seasonality"
## [5] "precip.warm.qtr" "precip.cold.qtr"
## [7] "slope" "aspect"
## [9] "Evergreen.Broadleaf.Trees" "Deciduous.Broadleaf.Trees"
## [11] "Shrubs" "Herbaceous"
## [13] "Cultivated" "Flooded"
## [15] "Urban" "Barren"
## [17] "Water"
Fit a generalized linear model with the VIF selected variables. Because I am using pa~. to select all variables, I need to make traindat.sub without the variables that should not be included.
traindat.sub <- traindat[,c("pa", vifvars)]
glmVIF <- stats::glm(formula =pa ~ .,
family = binomial(link = "logit"),
data=traindat.sub)
Save.
modellist[["glmVIF"]] <- list(model=glmVIF, name="glmVIF", desc="GLM - all VIF variables", terms=vifvars)
That model has a lot of variables. Let’s try stepwise variable selection using AIC as criteria.
glmStep <- MASS::stepAIC(object=glmVIF,
scope=pa ~ .,
data = traindat.sub,
direction = "both",
trace = 1,
k = 2,
control=glm.control(maxit=500))
modellist[["glmStep"]] <- list(model=glmStep, name="glmStep", desc="GLM - variables selected w step AIC", terms=attr(glmStep$terms, "term.labels"))
Let’s look at the best model.
anova(glmStep)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: pa
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 4485 3077.6
## temp.seasonality 1 20.465 4484 3057.2
## mean.temp.warm.qtr 1 210.277 4483 2846.9
## precip.seasonality 1 0.003 4482 2846.9
## precip.warm.qtr 1 66.829 4481 2780.0
## precip.cold.qtr 1 17.621 4480 2762.4
## slope 1 16.898 4479 2745.5
## Deciduous.Broadleaf.Trees 1 3.847 4478 2741.7
## Herbaceous 1 0.543 4477 2741.1
## Cultivated 1 61.598 4476 2679.5
## Urban 1 2.175 4475 2677.4
Let’s look at the variable importance.
varimp <- biomod2::variables_importance(glmVIF, data=traindat)$mat
varimp[varimp>0.01,]
## temp.seasonality mean.temp.warm.qtr precip.seasonality
## 0.029681 0.031504 0.029078
## precip.warm.qtr precip.cold.qtr Deciduous.Broadleaf.Trees
## 0.355408 0.071371 0.010885
## Shrubs Cultivated Barren
## 0.514971 0.147519 0.077704
Fit a generalized linear model with all the environmental variables.
glmEnv <- stats::glm(formula =pa ~ .,
family = binomial(link = "logit"),
data=traindat[,c("pa",envvars)])
modellist[["glmEnv"]] <- list(model=glmEnv, name="glmEnv", desc="GLM - environmental variables", terms=envvars)
We will compare to a topography only model.
glmTopo <- stats::glm(formula =pa ~ .,
family = binomial(link = "logit"),
data=traindat[,c("pa", topovars)])
modellist[["glmTopo"]] <- list(model=glmTopo, name="glmTopo", desc="GLM - topographical variables", terms=topovars)
We will compare to a tree cover plus temperature and precipition model.
glmLC <- stats::glm(formula =pa ~ .,
family = binomial(link = "logit"),
data=traindat[,c("pa", lcvars)])
modellist[["glmLC"]] <- list(model=glmLC, name="glmLC", desc="GLM - LC variables", terms=lcvars)
Here are the AICs so far.
unlist(lapply(modellist, function(x){AIC(x$model)}))
## glmVIF glmStep glmEnv glmTopo glmLC
## 2709.996 2699.358 2778.184 2804.955 2765.396
par(mfrow=c(2,3), mar=c(5, 4, 3, 2) + 0.1)
nm <- length(modellist)
for(i in names(modellist)){
erf.1 <- dismo::evaluate(pres_test, backg_test, model=modellist[[i]]$model)
plot(erf.1, 'ROC')
title(paste0(i,"\n\n"))
}
Let’s check model quality using the Boyce Index. First I will add the index to all my model lists.
for(i in names(modellist)){
pm <- predict(allVars,
modellist[[i]]$model,
type="response")
bindex <- ecospat::ecospat.boyce(pm, cbind(pres_test$lon, pres_test$lat), PEplot=FALSE)
modellist[[i]]$bindex <- bindex
}
Then I’ll make a data frame.
dfb <- data.frame(x=NULL, y=NULL, model=NULL)
for(i in names(modellist)){
bi <- modellist[[i]]$bindex
a <- data.frame(y=bi$F.ratio, x=bi$HS, model=i)
dfb <- rbind(dfb, a)
}
dfb$observed <- "yes"
dfb$observed[dfb$y==0] <- "no"
Now I can use that data frame with ggplot.
p <- ggplot(dfb, aes(x=x, y=y)) + geom_point(aes(col=observed)) +
ylab("Boyce Index") + xlab("Suitability")
p + facet_wrap(~model) +
ggtitle("Evaluation of the test data performance")
Our model with environmental variables is ok (it goes up as x increases). Our pure topographical model doesn’t look as good. Where there is high suitability there are no observations.
To plot predictions on the NH+VT map, I need to pass in the raster stack of the predictor variables and the model. I want to use type=response in order to get the probabilities (which is what a binomial fit returns).
pg.topo <- predict(allVars, glmTopo, type="response")
pg.env <- predict(allVars, glmEnv, type="response")
Now we can plot the prediction. Because I am making similar plots, I will make a function for my plots so they all look the same.
pm.plot <- function(x, main="", scale.min=0, scale.max=1, ..., mar=c(5,4,4,5)){
par(mar=mar)
plot(x, main=main, breaks=seq(scale.min, scale.max, (scale.max-scale.min)/10), col = terrain.colors(11),
xlab="Longitude", ylab="Latitude", cex=0.75, ...)
plot(nhvtshp, add=TRUE, border="blue")
plot(hbshp, add=TRUE)
}
Now make the prediction plots.
par(mfrow=c(1,2))
pm.plot(pg.topo, main='Topographical Variables')
pm.plot(pg.topo, main='Environmental Variables')
Let’s zoom in on Hubbard Brook.
par(mfrow=c(1,2))
xlims <- c(-71.9,-71.6)
ylims <- c(43.875,44)
pm.plot(pg.topo, main='Topographical Variables', xlim=xlims, ylim=ylims, scale.max=0.5)
pm.plot(pg.env, main='Environmental Variables', xlim=xlims, ylim=ylims, scale.max=0.5)
The GAM formula in R looks like response ~ s(var1, sm) + s(var2, sm). The s() is the spline function and allows the response to be non-linear. The second number, sm is the amount of smoothing and the default way you specify this is different for the gam::gam() function versus the mcgv::gam() function. Here I use gam::gam() and use the df argument (default). df=1 would be linear.
I will write a function to make my formula for the gam() call. That way I don’t have to make it manually.
gamfm <- function(x, df, f=NULL){
if(length(f)!=0) x<-x[x!=f]
fm <- paste0('s(', x, ', ', df, ')', collapse = ' + ')
if(length(f)!=0){
ff <- paste0(f, collapse = ' + ')
fm <- paste(fm,ff, sep="+")
}
fm <- as.formula(paste('pa ~', fm))
return(fm)
}
gamTopo2 <- gam::gam(formula = gamfm(topovars, 2),
data=traindat,
family="binomial")
gamTopo4 <- gam::gam(formula = gamfm(topovars, 4),
data=traindat,
family="binomial")
Save models.
mod <- paste0("gamTopo", c(2,4))
desc <- paste0("GAM - Topo variables df=", c(2,4))
for(i in 1:2){
pm <- predict(allVars, get(mod[i]), type="response")
bindex <- ecospat::ecospat.boyce(pm, cbind(pres_test$lon, pres_test$lat), PEplot=FALSE)
modellist[[mod[i]]] <- list(model=get(mod[i]), name=mod[i], desc=desc[i], terms=topovars, bindex = bindex)
}
gamLC2 <- gam::gam(formula = gamfm(lcvars,2),
data=traindat,
family="binomial")
gamLC4 <- gam::gam(formula = gamfm(lcvars,4),
data=traindat,
family="binomial")
Save models.
mod <- paste0("gamLC", c(2,4))
desc <- paste0("GAM - Tree Cover variables df=", c(2,4))
for(i in 1:2){
pm <- predict(allVars, get(mod[i]), type="response")
bindex <- ecospat::ecospat.boyce(pm, cbind(pres_test$lon, pres_test$lat), PEplot=FALSE)
modellist[[mod[i]]] <- list(model=get(mod[i]), name=mod[i], desc=desc[i], terms=topovars, bindex = bindex)
}
gamEnv2 <- gam::gam(formula = gamfm(envvars, 2),
data=traindat,
family="binomial")
gamEnv4 <- gam::gam(formula = gamfm(envvars, 4),
data=traindat,
family="binomial")
Save models.
mod <- paste0("gamEnv", c(2,4))
desc <- paste0("GAM - Environmental variables df=", c(2,4))
for(i in 1:2){
pm <- predict(allVars, get(mod[i]), type="response")
bindex <- ecospat::ecospat.boyce(pm, cbind(pres_test$lon, pres_test$lat), PEplot=FALSE)
modellist[[mod[i]]] <- list(model=get(mod[i]), name=mod[i], desc=desc[i], terms=topovars, bindex = bindex)
}
This one will use just 3 environmental variables.
minEnvVars <- c("precip.warm.qtr", "mean.temp", "temp.diurnal.range")
gamEnvMin <- gam::gam(formula = gamfm(minEnvVars, 4),
data=traindat,
family="binomial")
pm <- predict(allVars, gamEnvMin, type="response")
bindex <- ecospat::ecospat.boyce(pm, cbind(pres_test$lon, pres_test$lat), PEplot=FALSE)
modellist[["gamEnvMin"]] <- list(model=gamEnvMin, name="gamEnvMin", desc="GAM - Minimal", terms=minEnvVars, bindex = bindex)
Let’s look at the GAM effect size curves for the model with 5 environmental variables.
po <- gam:::preplot.Gam(gamEnv4, terms = attr(terms(gamEnv4), "term.labels"))
dfenv <- data.frame(x=NULL, y=NULL, se=NULL, variable=NULL)
for(i in names(po)){
vname <- stringr::str_replace(i, "s[(]", "")
vname <- stringr::str_replace(vname, ", 4[)]", "")
a <- data.frame(x=po[[i]]$x, y=po[[i]]$y, se=po[[i]]$se.y, variable=vname)
dfenv <- rbind(dfenv, a)
}
p <- ggplot(dfenv, aes(x=x, y=y)) + geom_line() +
geom_ribbon(aes(ymin=y+2*se, ymax=y-2*se), col="grey", alpha=0.5) +
ylab("effect size")
p + facet_wrap(~variable, scales="free")
Compare the topo, LC and environmenal predictions.
pg.topo <- predict(allVars, gamTopo4, type="response")
pg.env <- predict(allVars, gamEnv4, type="response")
pg.lc <- predict(allVars, gamLC4, type="response")
pg.min <- predict(allVars, gamEnvMin, type="response")
Now make the prediction plots.
par(mfrow=c(2,2))
mar <- c(0,0,2,0)
pm.plot(pg.topo, main='Topographical Variables', legend=FALSE, axes=FALSE, box=FALSE, mar=mar)
pm.plot(pg.env, main='Environmental Variables', legend=FALSE, axes=FALSE, box=FALSE, mar=mar)
pm.plot(pg.lc, main='Tree Cover Variables', legend=FALSE, axes=FALSE, box=FALSE, mar=mar)
pm.plot(pg.min, main='Three Env Variables', legend=FALSE, axes=FALSE, box=FALSE, mar=mar)
The environmental plot with the points.
par(mfrow=c(1,2))
pm.plot(pg.env, main='Environmental Variables')
pm.plot(pg.env, main='with observations')
psub <- subset(dat, species=="Trillium undulatum" & pa==1)
sp::coordinates(psub) <- c("lon", "lat")
plot(psub, pch=".", cex=1, col="red", add=TRUE)
aucs <- unlist(lapply(modellist,function(x){
dismo::evaluate(pres_test, backg_test, model=x$model)@auc
}))
sort(aucs)
## glmTopo gamTopo2 gamTopo4 glmEnv gamEnvMin glmLC glmStep glmVIF
## 0.7479959 0.7514587 0.7520620 0.7753430 0.7785331 0.7862645 0.7915496 0.7928306
## gamEnv2 gamLC2 gamEnv4 gamLC4
## 0.7961612 0.8012479 0.8048967 0.8197273
Let’s look at the Spearman correlations from the Boyce Index.
bis <- unlist(lapply(modellist,function(x){
x$bindex$Spearman.cor
}))
sort(bis)
## glmTopo gamEnvMin gamLC4 gamTopo2 gamTopo4 glmEnv glmVIF glmStep
## 0.847 0.895 0.897 0.914 0.920 0.933 0.942 0.945
## gamEnv4 gamEnv2 glmLC gamLC2
## 0.969 0.973 0.984 0.996
LC4 is highest though that one has no observations where the suitability is highest.
dfb <- data.frame(x=NULL, y=NULL, model=NULL)
for(i in c("gamEnv4","gamLC4","gamEnvMin")){
bi <- modellist[[i]]$bindex
a <- data.frame(y=bi$F.ratio, x=bi$HS, model=i)
dfb <- rbind(dfb, a)
}
dfb$observed <- "yes"
dfb$observed[dfb$y==0] <- "no"
p <- ggplot(dfb, aes(x=x, y=y)) + geom_point(aes(col=observed)) +
ylab("Boyce Index") + xlab("Suitability")
p + facet_wrap(~model, scales = "free_y") +
ggtitle("Evaluation of the test data performance")
Let’s zoom in on Hubbard Brook. The observations to the far right are next to the labs. The GLM makes the ridges (boundary) much higher in suitability than the lower elevation brook bottom (center). The model with tree cover also makes the ridge more suitable than the lower elevation.
par(mfrow=c(2,2))
xlims <- c(-71.9,-71.6)
ylims <- c(43.875,44)
mar <- c(0,0,3,0)
for(i in c("gamEnvMin", "gamEnv4", "gamLC4", "gamTopo4")){
pg <- predict(allVars, get(i), type="response")
pm.plot(pg, main=i, xlim=xlims, ylim=ylims, scale.max=0.5, box=FALSE, axes=FALSE, legend=FALSE, mar=mar)
plot(psub, pch=4, cex=1, col="red", add=TRUE)
}
Tree composition in Hubbard Brook also tracks these environmental conditions it looks like.
rp <- biomod2::response.plot2(models = c('glmTopo', 'gamTopo4'),
Data = traindat,
show.variables = topovars,
fixed.var.metric = 'mean', plot = FALSE, use.formal.names = TRUE)
For plotting, we’ll use ggplot2 and make a custom theme.
## define a custom ggplot2 theme
rp.gg.theme <- theme(legend.title = element_blank(),
axis.text.x = element_text(angle = 90, vjust = .5),
panel.background = element_rect(fill = NA, colour = "gray70"),
strip.background = element_rect(fill = NA, colour = "gray70"),
panel.grid.major = element_line(colour = "grey90"),
legend.key = element_rect(fill = NA, colour = "gray70"))
Now we can plot.
gg.rp <- ggplot(rp, aes(x = expl.val, y = pred.val, lty = pred.name)) +
geom_line() + ylab("prob of occ") + xlab("") +
rp.gg.theme +
facet_grid(~ expl.name, scales = 'free_x') +
ggtitle("Topographical variables")
print(gg.rp)
Repeat the same for the environmental variables model. Not all these model have all the envvars terms.
rp <- biomod2::response.plot2(models = c('glmVIF', 'glmEnv', 'gamEnv4', 'gamLC4'),
Data = traindat,
show.variables = envvars,
fixed.var.metric = 'mean', plot = FALSE, use.formal.names = TRUE)
gg.rp <- ggplot(rp, aes(x = expl.val, y = pred.val, col = pred.name)) +
geom_line() + ylab("prob of occ") + xlab("") +
rp.gg.theme +
facet_wrap(~ expl.name, scales = 'free_x') +
ggtitle("Environmental variables")
print(gg.rp)
Boyce plots for all variables.
dfb <- data.frame(x=NULL, y=NULL, model=NULL)
for(i in names(modellist)){
bi <- modellist[[i]]$bindex
a <- data.frame(y=bi$F.ratio, x=bi$HS, model=i)
dfb <- rbind(dfb, a)
}
dfb$observed <- "yes"
dfb$observed[dfb$y==0] <- "no"
p <- ggplot(dfb, aes(x=x, y=y)) + geom_point(aes(col=observed)) +
ylab("Boyce Index") + xlab("Suitability")
p + facet_wrap(~model, scales = "free_y") +
ggtitle("Evaluation of the test data performance")
Compare AICs, Spearman Correlation for the models with seed = 10.
df <- data.frame(
name=unlist(lapply(modellist, function(x){x$name})),
Spearman=unlist(lapply(modellist, function(x){x$bindex$Spearman.cor})),
AUC=unlist(lapply(modellist,function(x){dismo::evaluate(pres_test, backg_test, model=x$model)@auc})),
AIC=unlist(lapply(modellist, function(x){AIC(x$model)}))
)
df$delAIC <- df$AIC-min(df$AIC)
df <- df[order(df$AIC),]
knitr::kable(df, row.names=FALSE)
| name | Spearman | AUC | AIC | delAIC |
|---|---|---|---|---|
| gamEnv4 | 0.969 | 0.8048967 | 2585.505 | 0.00000 |
| gamEnv2 | 0.973 | 0.7961612 | 2647.310 | 61.80438 |
| gamLC4 | 0.897 | 0.8197273 | 2690.918 | 105.41256 |
| glmStep | 0.945 | 0.7915496 | 2699.358 | 113.85266 |
| glmVIF | 0.942 | 0.7928306 | 2709.996 | 124.49035 |
| gamLC2 | 0.996 | 0.8012479 | 2730.106 | 144.60068 |
| gamEnvMin | 0.895 | 0.7785331 | 2737.029 | 151.52384 |
| gamTopo4 | 0.920 | 0.7520620 | 2763.145 | 177.63937 |
| glmLC | 0.984 | 0.7862645 | 2765.396 | 179.89125 |
| glmEnv | 0.933 | 0.7753430 | 2778.184 | 192.67862 |
| gamTopo2 | 0.914 | 0.7514587 | 2784.471 | 198.96568 |
| glmTopo | 0.847 | 0.7479959 | 2804.955 | 219.44968 |
I want to make sure that the selected variables are not changing a lot with different random test splits. I’ll make 20 random splits and look for variables that seem problematic (effect changes depending on the random data). I am going to select 50% of my data for training instead of 80%. I want to see how the variable selection and importance changes with different training data.
datlist <- list()
n=10
k=2
presdat <- subset(datsub, pa==1)
bgdat <- subset(datsub, pa==0)
for(i in 1:n){
group <- dismo::kfold(presdat, k=k) # 5 groups = 20 test/80 train split
pres_train <- presdat[group != 1, ]
pres_test <- presdat[group == 1, ]
group <- dismo::kfold(bgdat, k=k) # 3 groups = 25 test/75 train split
backg_train <- bgdat[group != 1, ]
backg_test <- bgdat[group == 1, ]
traindat <- rbind(pres_train, backg_train)
datlist[[i]] <- list(pres_train=pres_train, pres_test=pres_test,
backg_train=backg_train, backg_test=backg_test,
traindat=traindat)
}
modlist <- list()
for(i in 1:n){
gamEnv <- gam::gam(formula = gamfm(envvars, 4),
data=datlist[[i]]$traindat,
family="binomial")
modlist[[paste("split",i)]] <- gamEnv
}
df <- data.frame()
for(i in 1:n){
mod1 <- modlist[[i]]
rp <- biomod2::response.plot2(models = "mod1",
Data = datlist[[i]]$traindat,
show.variables = envvars,
fixed.var.metric = 'mean', plot = FALSE, use.formal.names = TRUE)
df <- rbind(df, cbind(rp, i=paste("samp", i)))
}
gg.rp <- ggplot(df, aes(x = expl.val, y = pred.val)) +
geom_line(aes(col=i)) + ylab("prob of occ") + xlab("") +
rp.gg.theme +
facet_wrap(~ expl.name, scales = 'free_x') +
ggtitle("Response curves across random samples") +
theme(legend.position = "none")
print(gg.rp)
df2 <- data.frame()
for(i in 1:n){
gamStart <- gam::gam(formula = gamfm(envvars, 4),
data=datlist[[i]]$traindat,
family="binomial")
varimp <- biomod2::variables_importance(gamStart, data=datlist[[i]]$traindat)$mat
varimp <- varimp[varimp[,1]!=0,]
df2 <- rbind(df2, data.frame(varimp=varimp, term=names(varimp), i=paste("samp", i)))
}
gg.rp <- ggplot(df2, aes(term, varimp)) +
geom_boxplot() + coord_flip() +
ggtitle("Variable importance across data sets")
print(gg.rp)
Now rerun the analysis with many different random splits of the data and see how variable the results are.
n <- 20
allmodels <- list()
for(mn in 1:n){
presdat <- subset(datsub, pa==1)
group <- dismo::kfold(presdat, k=5) # 5 groups = 20 test/80 train split
pres_train <- presdat[group != 1, ]
pres_test <- presdat[group == 1, ]
bgdat <- subset(datsub, pa==0)
group <- dismo::kfold(bgdat, k=5)
backg_train <- bgdat[group != 1, ]
backg_test <- bgdat[group == 1, ]
traindat <- rbind(pres_train, backg_train)
modellist <- list()
glmVIF <- stats::glm(formula =pa ~ .,
family = binomial(link = "logit"),
data=traindat[,c("pa", vifvars)])
modellist[["glmVIF"]] <- list(model=glmVIF, name="glmVIF", desc="GLM - all VIF variables", terms=vifvars)
glmStep <- MASS::stepAIC(object=glmVIF,
scope=pa ~ .,
data = traindat.sub,
direction = "both",
trace = 1,
k = 2,
control=glm.control(maxit=500))
modellist[["glmStep"]] <- list(model=glmStep, name="glmStep", desc="GLM - variables selected w step AIC", terms=attr(glmStep$terms, "term.labels"))
glmLC <- stats::glm(formula =pa ~ .,
family = binomial(link = "logit"),
data=traindat[,c("pa",lcvars)])
modellist[["glmLC"]] <- list(model=glmLC, name="glmLC", desc="GLM - LC variables", terms=lcvars)
glmTopo <- stats::glm(formula =pa ~ .,
family = binomial(link = "logit"),
data=traindat[,c("pa",topovars)])
modellist[["glmTopo"]] <- list(model=glmTopo, name="glmTopo", desc="GLM - topo variables", terms=topovars)
glmEnv <- stats::glm(formula =pa ~ .,
family = binomial(link = "logit"),
data=traindat[,c("pa",envvars)])
modellist[["glmEnv"]] <- list(model=glmEnv, name="glmEnv", desc="GLM - environmental variables", terms=envvars)
gamEnvMin <- gam::gam(formula = gamfm(minEnvVars, 4),
data=traindat,
family="binomial")
modellist[["gamEnvMin"]] <- list(model=gamEnvMin, name="gamEnvMin", desc="GAM - Minimal", terms=minEnvVars)
gamTopo2 <- gam::gam(formula = gamfm(topovars, 2),
data=traindat,
family="binomial")
gamTopo4 <- gam::gam(formula = gamfm(topovars, 4),
data=traindat,
family="binomial")
mod <- paste0("gamTopo", c(2,4))
desc <- paste0("GAM - Topo variables df=", c(2,4))
for(i in 1:2){
modellist[[mod[i]]] <- list(model=get(mod[i]), name=mod[i], desc=desc[i], terms=topovars)
}
gamLC2 <- gam::gam(formula = gamfm(lcvars,2),
data=traindat,
family="binomial")
gamLC4 <- gam::gam(formula = gamfm(lcvars,4),
data=traindat,
family="binomial")
mod <- paste0("gamLC", c(2,4))
desc <- paste0("GAM - Tree Cover variables df=", c(2,4))
for(i in 1:2){
modellist[[mod[i]]] <- list(model=get(mod[i]), name=mod[i], desc=desc[i], terms=topovars)
}
gamEnv2 <- gam::gam(formula = gamfm(envvars, 2),
data=traindat,
family="binomial")
gamEnv4 <- gam::gam(formula = gamfm(envvars, 4),
data=traindat,
family="binomial")
mod <- paste0("gamEnv", c(2,4))
desc <- paste0("GAM - Environmental variables df=", c(2,4))
for(i in 1:2){
modellist[[mod[i]]] <- list(model=get(mod[i]), name=mod[i], desc=desc[i], terms=topovars)
}
for(i in names(modellist)){
pm <- predict(allVars,
modellist[[i]]$model,
type="response")
bindex <- ecospat::ecospat.boyce(pm, cbind(pres_test$lon, pres_test$lat), PEplot=FALSE)
modellist[[i]]$bindex <- bindex
modellist[[i]]$AIC <- AIC(modellist[[i]]$model)
modellist[[i]]$AUC <- dismo::evaluate(pres_test, backg_test, model=modellist[[i]]$model)@auc
modellist[[i]]$Spearman.cor <- modellist[[i]]$bindex$Spearman.cor
}
allmodels[[mn]] <- modellist
}
save(allmodels, file="allmodels.RData")
df2 <- data.frame()
for(i in 1:length(allmodels)){
minAIC <- min(unlist(lapply(allmodels[[i]], function(x){x$AIC})))
for(j in names(allmodels[[i]])){
mod <- allmodels[[i]][[j]]
df2 <- rbind(df2, data.frame(n=i,model=j,delAIC=mod$AIC-minAIC, Spearman.cor=mod$Spearman.cor, AUC=mod$AUC, AIC=mod$AIC))
}
}
df2$n <- as.factor(df$n)
df.allmodels <- df2
save(df.allmodels, allmodels, file="allmodels.RData")
save(df.allmodels, file="dfallmodels.RData")
library(tidyr)
load("dfallmodels.RData")
df.long <- pivot_longer(df.allmodels, c(-n, -model), names_to = "term", values_to = "value")
p <- ggplot(df.long, aes(model, value))
p + geom_boxplot() + coord_flip() + facet_wrap(~term, scale="free_x")